What is the distribution of CTCF ChIP peaks from K562 lines in following genomic regions: tss -> exons -> introns -> intergenic regions.
Load the packages
library(genomation)
library(rtracklayer)
library(GenomicRanges)
Load the files
ctcf = readGeneric('Data/CTCF_K562_hg38.chr21.bed', zero.based=TRUE)
gtf = import.gff('./Data/hg38_EnsemblGenes.chr21.gtf.gz')
Construct the annotation
tss = promoters(subset(gtf, type=='gene'), 1000, 1000)
exon = subset(gtf, type=='exon')
intron = subset(gtf, type=='gene')
annotation_list = list(tss = tss, exon=exon, intron=intron)
annotation_list = GRangesList(annotation_list)
Find the overlaps between the annotation and peaks
ovlaps = as.data.frame(findOverlaps(ctcf, annotation_list))
ovlaps$annotation = names(annotation_list)[ovlaps$subjectHits]
ovlaps = ovlaps[order(ovlaps$subjectHits),]
ovlaps = ovlaps[!duplicated(ovlaps$queryHits),]
Creates the annotation vector and outputs the statistics
annot_vector = rep('intergenic', length(ctcf))
annot_vector[ovlaps$queryHits] = ovlaps$annotation
table(annot_vector)
## annot_vector
## exon intergenic intron tss
## 381 1041 1663 380
library(genomation)
library(GenomicRanges)
library(dplyr)
library(ggplot2)
library(ComplexHeatmap)
chip.files = list.files('./Data/ChIP', full.names=TRUE)
lchip = lapply(chip.files, readGeneric, zero.based=TRUE)
lnams = basename(chip.files)
lnams = sub('_GRCh38.bed.gz','',lnams)
lnams = sub('K562_','',lnams)
names(lchip) = lnams
lchip = GRangesList(lchip)
lchip = reduce(lchip)
uchip = unlist(lchip)
data.frame(sample = names(uchip), width = width(uchip)) %>%
mutate(MAX = case_when(
sample == 'MAX' ~ 'MAX',
sample == 'TAL1' ~ 'TAL1',
sample == 'FOXA1' ~ 'FOXA1',
sample == 'ZNF639' ~ 'ZNF639',
TRUE ~ 'Other'
)) %>%
ggplot(aes(width, color=MAX)) +
geom_density(size=1) +
scale_color_manual(values=c('darkorange','darkorange1','black','darkorange2','darkorange3','darkorange4')) +
scale_x_log10() +
theme_bw()
We’ll throw them out
lchip = lchip[!names(lchip) %in% c('MAX','TAL1','FOXA1','ZNF639')]
uchip = unlist(lchip)
ovlap = as.data.frame(findOverlaps(uchip, lchip))
ovlap$set1 = names(uchip)[ovlap$queryHits]
ovlap$set2 = names(lchip)[ovlap$subjectHits]
inter = ovlap %>%
group_by(set1, set2) %>%
summarize(intersection = length(unique(queryHits)))
inter$key = with(inter, paste(set1, set2))
union = expand.grid(seq(lchip), seq(lchip))
union$set1 = names(lchip)[union$Var1]
union$set2 = names(lchip)[union$Var2]
union$length1 = elementNROWS(lchip)[union$Var1]
union$length2 = elementNROWS(lchip)[union$Var2]
union$sum_length = with(union, (length1 + length2))
union$key = with(union, paste(set1, set2))
union = union[,-c(1:4)]
dist = merge(inter, union, by='key')
dist$perc = with(dist, round(intersection/length1,2))
dist$union = with(dist, sum_length - intersection)
dist$jacc = with(dist, intersection/union)
Because the peak overlap is not a symmetric measure, we average the approximate jaccard index between TFA - TFB, and TFB - TFA
mat.jacc = data.table::dcast(dist, set1~set2, value.var='jacc',fill=0) %>%
mutate(set1 = NULL)
rownames(mat.jacc) = colnames(mat.jacc)
mat.jacc = (mat.jacc + t(mat.jacc))/2
Heatmap(mat.jacc, col=circlize::colorRamp2(breaks = c(0,1),colors=c('white','red')))
Because the percentage of overlap is not a symmetric measure, we average the approximate jaccard index between TFA - TFB, and TFB - TFA
mat.perc = data.table::dcast(dist, set1~set2, value.var='perc',fill=0) %>%
mutate(set1 = NULL)
rownames(mat.perc) = colnames(mat.perc)
mat.perc = (mat.perc + t(mat.perc))/2
Heatmap(mat.perc, col=circlize::colorRamp2(breaks = c(0,1),colors=c('white','red')))